@@ -23,14 +23,16 @@ module Agents |
||
23 | 23 |
|
24 | 24 |
To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes. |
25 | 25 |
|
26 |
- When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or a `xpath` XPath expression and either `"text": true` or `attr` pointing to an attribute name to grab. An example: |
|
26 |
+ When parsing HTML or XML, these sub-hashes specify how each extraction should be done. For each extraction key, the Agent first selects a node set from the document by evaluating either a CSS selector given in `css` or an XPath expression given in `xpath`. It then evaluates the XPath expression given in `value` on each node in the node set and converts the result into a string. Here's an example: |
|
27 | 27 |
|
28 | 28 |
"extract": { |
29 |
- "url": { "css": "#comic img", "attr": "src" }, |
|
30 |
- "title": { "css": "#comic img", "attr": "title" }, |
|
31 |
- "body_text": { "css": "div.main", "text": true } |
|
29 |
+ "url": { "css": "#comic img", "value": "@src" }, |
|
30 |
+ "title": { "css": "#comic img", "value": "@title" }, |
|
31 |
+ "body_text": { "css": "div.main", "value": "text()" } |
|
32 | 32 |
} |
33 | 33 |
|
34 |
+ "@_attr_" is the XPath expression that extracts the value of the attribute named _attr_ from a node, and "text()" extracts the enclosed text (only the first text node directly inside the node; use "string(.)" to get the full text including that of descendant elements). You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) such as `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number. |
|
35 |
+ |
|
34 | 36 |
When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. For example: |
35 | 37 |
|
36 | 38 |
"extract": { |
@@ -70,9 +72,9 @@ module Agents |
||
70 | 72 |
'type' => "html", |
71 | 73 |
'mode' => "on_change", |
72 | 74 |
'extract' => { |
73 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" }, |
|
74 |
- 'title' => { 'css' => "#comic img", 'attr' => "alt" }, |
|
75 |
- 'hovertext' => { 'css' => "#comic img", 'attr' => "title" } |
|
75 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" }, |
|
76 |
+ 'title' => { 'css' => "#comic img", 'value' => "@alt" }, |
|
77 |
+ 'hovertext' => { 'css' => "#comic img", 'value' => "@title" } |
|
76 | 78 |
} |
77 | 79 |
} |
78 | 80 |
end |
@@ -157,14 +159,11 @@ module Agents |
||
157 | 159 |
return |
158 | 160 |
end |
159 | 161 |
result = nodes.map { |node| |
160 |
- if extraction_details['attr'] |
|
161 |
- node.attr(extraction_details['attr']) |
|
162 |
- elsif extraction_details['text'] |
|
163 |
- node.text() |
|
164 |
- else |
|
165 |
- error '"attr" or "text" is required on HTML or XML extraction patterns' |
|
166 |
- return |
|
162 |
+ value, = node.xpath(extraction_details['value']) |
|
163 |
+ if value.is_a?(Float) && value.to_i == value |
|
164 |
+ value = value.to_i |
|
167 | 165 |
end |
166 |
+ value.to_s |
|
168 | 167 |
} |
169 | 168 |
log "Extracting #{extraction_type} at #{xpath || css}: #{result}" |
170 | 169 |
end |
@@ -0,0 +1,22 @@ |
||
1 |
# Data migration: rewrites the stored options of every WebsiteAgent so that
# HTML/XML extractions use the XPath-based `value` key instead of the legacy
# `attr` / `text` keys.
#
# JSON-type agents are left untouched, since their extractions do not use
# `attr` / `text`.
class AdoptXpathInWebsiteAgent < ActiveRecord::Migration
  # XPath replacement for the legacy `"text": true` option.
  #
  # `string(.)` yields the text of the node including all of its descendants,
  # which is what the former `node.text()` call returned.  A bare `text()`
  # would select only the direct text children, and the extraction code keeps
  # just the first result, so text nested in child elements would be lost.
  TEXT_XPATH = 'string(.)'.freeze

  # Converts every non-JSON WebsiteAgent in place and saves it.
  #
  # Raises whatever `save!` raises if a converted agent cannot be saved.
  def up
    Agent.where(type: 'Agents::WebsiteAgent').each do |agent|
      next if agent.extraction_type == 'json'

      extract = agent.options['extract']
      # An agent stored without an `extract` hash has nothing to convert;
      # skipping it keeps one malformed record from aborting the migration.
      next unless extract.is_a?(Hash)

      agent.options_will_change!
      extract.each_value do |extraction|
        next unless extraction.is_a?(Hash)

        # Remove both legacy keys up front so no stale option is left behind.
        attr = extraction.delete('attr')
        text = extraction.delete('text')

        # The former extraction code checked `attr` before `text`, so `attr`
        # must win here too when a pattern carries both keys.
        if attr
          extraction['value'] = "@#{attr}"
        elsif text
          extraction['value'] = TEXT_XPATH
        end
      end
      agent.save!
    end
  end

  # The original `attr` / `text` form is not restored.
  #
  # Raises ActiveRecord::IrreversibleMigration unconditionally.
  def down
    raise ActiveRecord::IrreversibleMigration, "Cannot revert this migration"
  end
end
@@ -10,8 +10,8 @@ jane_website_agent: |
||
10 | 10 |
:expected_update_period_in_days => 2, |
11 | 11 |
:mode => :on_change, |
12 | 12 |
:extract => { |
13 |
- :title => {:css => "item title", :text => true}, |
|
14 |
- :url => {:css => "item link", :text => true} |
|
13 |
+ :title => {:css => "item title", :value => 'text()'}, |
|
14 |
+ :url => {:css => "item link", :value => 'text()'} |
|
15 | 15 |
} |
16 | 16 |
}.to_json.inspect %> |
17 | 17 |
|
@@ -27,8 +27,8 @@ bob_website_agent: |
||
27 | 27 |
:expected_update_period_in_days => 2, |
28 | 28 |
:mode => :on_change, |
29 | 29 |
:extract => { |
30 |
- :url => {:css => "#comic img", :attr => "src"}, |
|
31 |
- :title => {:css => "#comic img", :attr => "title"} |
|
30 |
+ :url => {:css => "#comic img", :value => "@src"}, |
|
31 |
+ :title => {:css => "#comic img", :value => "@title"} |
|
32 | 32 |
} |
33 | 33 |
}.to_json.inspect %> |
34 | 34 |
|
@@ -768,8 +768,8 @@ describe AgentDrop do |
||
768 | 768 |
url: 'http://dilbert.com/', |
769 | 769 |
mode: 'on_change', |
770 | 770 |
extract: { |
771 |
- url: { css: '[id^=strip_enlarged_] img', attr: 'src' }, |
|
772 |
- title: { css: '.STR_DateStrip', text: true }, |
|
771 |
+ url: { css: '[id^=strip_enlarged_] img', value: '@src' }, |
|
772 |
+ title: { css: '.STR_DateStrip', value: 'text()' }, |
|
773 | 773 |
}, |
774 | 774 |
}, |
775 | 775 |
schedule: 'every_12h', |
@@ -11,9 +11,9 @@ describe Agents::WebsiteAgent do |
||
11 | 11 |
'url' => "http://xkcd.com", |
12 | 12 |
'mode' => 'on_change', |
13 | 13 |
'extract' => { |
14 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" }, |
|
15 |
- 'title' => { 'css' => "#comic img", 'attr' => "alt" }, |
|
16 |
- 'hovertext' => { 'css' => "#comic img", 'attr' => "title" } |
|
14 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" }, |
|
15 |
+ 'title' => { 'css' => "#comic img", 'value' => "@alt" }, |
|
16 |
+ 'hovertext' => { 'css' => "#comic img", 'value' => "@title" } |
|
17 | 17 |
} |
18 | 18 |
} |
19 | 19 |
@checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @valid_options, :keep_events_for => 2) |
@@ -256,8 +256,8 @@ describe Agents::WebsiteAgent do |
||
256 | 256 |
'url' => "http://xkcd.com", |
257 | 257 |
'mode' => "on_change", |
258 | 258 |
'extract' => { |
259 |
- 'url' => {'css' => "#topLeft a", 'attr' => "href"}, |
|
260 |
- 'title' => {'css' => "#topLeft a", 'text' => "true"} |
|
259 |
+ 'url' => {'css' => "#topLeft a", 'value' => "@href"}, |
|
260 |
+ 'title' => {'css' => "#topLeft a", 'value' => "text()"} |
|
261 | 261 |
} |
262 | 262 |
} |
263 | 263 |
rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site) |
@@ -389,9 +389,9 @@ describe Agents::WebsiteAgent do |
||
389 | 389 |
'url' => "http://www.example.com", |
390 | 390 |
'mode' => 'on_change', |
391 | 391 |
'extract' => { |
392 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" }, |
|
393 |
- 'title' => { 'css' => "#comic img", 'attr' => "alt" }, |
|
394 |
- 'hovertext' => { 'css' => "#comic img", 'attr' => "title" } |
|
392 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" }, |
|
393 |
+ 'title' => { 'css' => "#comic img", 'value' => "@alt" }, |
|
394 |
+ 'hovertext' => { 'css' => "#comic img", 'value' => "@title" } |
|
395 | 395 |
}, |
396 | 396 |
'basic_auth' => "user:pass" |
397 | 397 |
} |
@@ -421,7 +421,7 @@ describe Agents::WebsiteAgent do |
||
421 | 421 |
'mode' => 'on_change', |
422 | 422 |
'headers' => { 'foo' => 'bar' }, |
423 | 423 |
'extract' => { |
424 |
- 'url' => { 'css' => "#comic img", 'attr' => "src" }, |
|
424 |
+ 'url' => { 'css' => "#comic img", 'value' => "@src" }, |
|
425 | 425 |
} |
426 | 426 |
} |
427 | 427 |
@checker = Agents::WebsiteAgent.new(:name => "ua", :options => @valid_options) |